section .data
    ; no initialized data to keep binary tiny

section .bss
    ; trail intensity map: 128 rows of 256 bytes each, so a cell is addressed
    ; as (row * 256 + column), i.e. row in the high byte, column in the low byte
    trailmap resb 256*128
    ; scratch copy of one 256-sample line, filled by F_copy_row_to_buf and
    ; read by F_blur_rowbuf
    rowbuf resb 256

    ; every particle is 4 bytes in RAM: posx, posy, velx, vely
    ; (256 particles in total)
    particles resb 256*4



section .text
    org 0x100             ; Origin for .COM files

;-----------------------------------------------------------------------
; start - program entry point (DOS .COM, CS = DS = ES = SS on entry)
;
; Switches to VGA mode 13h, clears the trail map, seeds the particle
; table with pseudo-random bytes, installs I_main as the INT 1Ch (timer
; tick) handler and then idles forever; all further work happens inside
; the interrupt handler.
;
; Never returns.
;-----------------------------------------------------------------------
start:
    cld                   ; string instructions below must count upwards

    ; Set video mode to 13h (320x200)
    mov ax, 0x0013
    int 0x10

    call F_clear_trailmap

    ; Seed the particle table BEFORE hooking the timer, so the handler
    ; never runs on uninitialised particles and cannot disturb this loop.
    push ds
    pop es
    mov di, particles

    ; explicit seed: the pattern must not depend on what the BIOS/DOS
    ; calls happen to leave in AX
    mov ax, 0x2500 + 0x1c

    mov cx, 4*256
    L_test_fill:
        imul ax, 1337     ; simple linear-congruential step
        add ax, 7331
        stosb             ; low byte of the generator becomes the next field
        loop L_test_fill

    ; Set our custom ISR for int 0x1c (DOS function 25h, DS:DX = handler)
    mov ax, 0x2500 + 0x1c
    mov dx, I_main
    int 0x21

    ; Sleep until the next interrupt, forever. A single HLT would resume
    ; after the first tick and fall through into F_clear_trailmap.
    L_idle:
        hlt
        jmp L_idle

;-----------------------------------------------------------------------
; F_clear_trailmap - fill all 256*128 bytes of trailmap with zero.
; In:    DS = data segment, direction flag clear
; Out:   ES = DS, DI = end of trailmap, CX = 0, AL = 0
;-----------------------------------------------------------------------
F_clear_trailmap:
    mov cx, 256*128       ; byte count
    mov di, trailmap      ; destination offset

    push ds               ; STOSB writes through ES, so alias it to DS
    pop es

    xor al, al            ; fill value
    rep stosb

    ret

;-----------------------------------------------------------------------
; F_move_particle - advance every particle by its velocity.
;
; For each of the 256 particle records (posx, posy, velx, vely) the
; signed velocity bytes are arithmetically shifted right by 5 and added
; to the position bytes; positions wrap modulo 256.
;
; Clobbers: AX, CX, SI, flags
;-----------------------------------------------------------------------
F_move_particle:
    mov si, particles
    mov cx, 256
    L_move_particle:

        ; x axis: al = posx + (velx >> 5)
        mov al, [si+2]
        sar al, 5         ; the file treats velocity as 2.6 fixed-point
        add al, [si]

        ; y axis: ah = posy + (vely >> 5)
        mov ah, [si+3]
        sar ah, 5
        add ah, [si+1]

        mov [si], ax      ; write posx (low byte) and posy (high byte) together

        add si, 4         ; next 4-byte particle record
        loop L_move_particle
    ret


;-----------------------------------------------------------------------
; F_sense_particle - steer every particle towards the stronger trail.
;
; For each particle two trail-map cells are sampled ahead of it, one
; offset to each side of its heading. The velocity is nudged sideways
; towards whichever sample is brighter; equal samples leave it as is.
;
; Sensor coordinates are turned into trail-map addresses the same way
; F_deposit does it: row masked to 0..127, then offset from trailmap.
;
; In:       DS = data segment
; Clobbers: AX, BX, CX, DX, SI, flags
;-----------------------------------------------------------------------
F_sense_particle:
    mov cx, 256
    mov si, particles
    L_sense_particle:
        push cx           ; CX doubles as scratch below


        mov ax, [si+2] ; loads velx into al and vely into ah

        ; look-ahead vector: shift by 3 instead of the 5 used when moving,
        ; i.e. four times the per-tick step
        sar al, 3
        sar ah, 3

        ; sideways vector (bl, bh) = (-ah, al) / 2:
        ; perpendicular to the look-ahead, half its length
        mov bh, al
        mov bl, 0
        sub bl, ah

        sar bl, 1
        sar bh, 1

        mov cx, [si] ; sensor position, starting with particle pos
        add cl, al
        add ch, ah

        add cl, bl
        add ch, bh

        push si

        ; first sample: ahead + sideways
        mov si, cx
        and si, 0x7FFF    ; there are only 128 rows
        mov dl, [trailmap + si]


        ; look the other way: ahead - sideways
        sub cl, bl
        sub ch, bh
        sub cl, bl
        sub ch, bh


        mov si, cx
        and si, 0x7FFF    ; there are only 128 rows
        mov dh, [trailmap + si]

        pop si


        ; dl is the trail sample left, dh is the trail sample right


        mov cx, [si+2] ; loads velx into cl vely into ch

        ; trail intensities are unsigned bytes (0..255), so the comparison
        ; must use the unsigned condition codes
        cmp dl, dh
        je B_turn_end
        jb B_turn_right

        B_turn_left:
            add cl, bl
            add ch, bh
            jmp B_turn_end

        B_turn_right:
            sub cl, bl
            sub ch, bh

        B_turn_end:


        mov [si+2], cx ; stores cl into velx and ch into vely

        add si, 4
        pop cx
        loop L_sense_particle
    ret

;-----------------------------------------------------------------------
; F_deposit - mark each particle's current cell in the trail map.
;
; The position word (posx in the low byte, posy in the high byte) is
; used directly as a cell index after the row is reduced to 0..127;
; the cell is set to full intensity (0xFF).
;
; In:       DS = data segment
; Clobbers: AX, CX, SI, DI, flags
;-----------------------------------------------------------------------
F_deposit:
    mov di, particles
    mov cx, 256
    L_deposit:
        mov ax, [di]          ; al = posx, ah = posy
        and ax, 0x7FFF        ; only 128 rows: drop the top bit of posy
        mov si, ax
        lea si, [si + trailmap] ; row*256 + column, relative to the map
        ;add byte [ds:si], 242
        mov byte [si], 0xFF

        add di, 4             ; next particle record
        loop L_deposit

    ret

;-----------------------------------------------------------------------
; F_horizontal_blur - blur every row of the trail map in place.
;
; Each of the 128 rows is copied into rowbuf and then written back
; through F_blur_rowbuf. The element stride is handed to both helpers
; as a word on the stack (1 = walk along a row).
;
; In:       DS = data segment (trailmap and rowbuf both live there)
; Out:      ES = DS
; Clobbers: AX, BX, CX, DX, SI, DI, BP, flags
;-----------------------------------------------------------------------
F_horizontal_blur:
    ; the copy helper stores through ES:DI, so ES has to alias DS
    push ds
    pop es

    mov si, trailmap      ; SI = start of the row being processed

    mov cx, 128           ; one pass per row
    L_blur_row:
        push cx           ; row counter; CX is reused by the helpers

        push word 1       ; stride argument. TODO: set to 256 for vertical blur

        ; rowbuf <- current row. The helper advances SI, so keep a copy.
        push si
        call F_copy_row_to_buf
        pop si

        ; the blur helper wants the destination in DI and uses SI as its
        ; own index; after the call DI points just past the written row
        xchg si, di
        call F_blur_rowbuf
        xchg si, di       ; SI = start of the next row

        pop cx            ; discard the stride argument

        pop cx            ; restore the row counter
        loop L_blur_row

    ret


;-----------------------------------------------------------------------
; F_copy_row_to_buf - gather 256 strided bytes into rowbuf.
;
; In:       DS:SI = first source byte
;           ES    = segment of rowbuf
;           stack = [return][caller's saved SI][stride word]
; Out:      DI = rowbuf + 256, SI advanced by 256 * stride, CX = 0
; Clobbers: AL, BP, flags
;-----------------------------------------------------------------------
F_copy_row_to_buf:
    mov bp, sp            ; stride is at [bp+4]: return address and the
                          ; caller's saved SI sit below it
    mov cx, 256           ; samples per line
    mov di, rowbuf
    copy_to_rowbuf_loop:
        lodsb             ; AL = source byte
        stosb             ; append it to rowbuf
        dec si            ; cancel the step LODSB took ...
        add si, word [bp+4] ; ... and apply the requested stride instead
        loop copy_to_rowbuf_loop

    ret


;-----------------------------------------------------------------------
; F_blur_rowbuf - write a 1-2-1 smoothed copy of rowbuf to DS:DI.
;
; out[i] = (rowbuf[i-1] + 2*rowbuf[i] + rowbuf[i+1]) / 4, with the
; neighbour indices wrapping around at the ends of the 256-sample line.
;
; In:       DS:DI = first destination byte
;           stack = [return][stride word]
; Out:      DI advanced by 256 * stride, SI = 256, CX = 0
; Clobbers: AX, BX, DX, BP, flags
;-----------------------------------------------------------------------
F_blur_rowbuf:

    mov bp, sp            ; stride is at [bp+2], just above the return address
    xor si, si            ; SI = index of the current sample
    mov cx, 256           ; samples per line

    rowbuf_loop:
        ; centre sample, weighted twice
        xor ax, ax
        mov al, [rowbuf + si]
        mov dx, ax
        add dx, ax            ; DX = 2 * current

        ; left neighbour, index (SI - 1) mod 256
        lea bx, [si - 1]
        xor bh, bh
        mov al, [rowbuf + bx] ; AH is still zero
        add dx, ax

        ; right neighbour, index (SI + 1) mod 256
        lea bx, [si + 1]
        xor bh, bh
        mov al, [rowbuf + bx]
        add dx, ax

        ; divide the weighted sum by 4 and store it
        shr dx, 2
        mov [di], dl

        inc si                ; next sample
        add di, word [bp+2]   ; next destination, honouring the stride

        loop rowbuf_loop

    ret





;; F_decay:
;;     push ds
;;     pop es
;;
;;     mov di, trailmap
;;     mov si, trailmap
;;     mov cx, 256*128
;;
;;     L_decay_fill:
;;         lodsb
;;         mov ah, al  ; copy the value
;;         shr ah, 5   ; divide by 32
;;         sub al, ah  ; subtract the quotient from the original value
;;         stosb
;;         loop L_decay_fill
;;
;;     ret


;-----------------------------------------------------------------------
; I_main - INT 1Ch handler, called at 18.2 Hz by the timer interrupt.
;
; Runs one simulation step (blur, sense, move, deposit) and then copies
; the 256x128 trail map to the centre of the 320x200 mode 13h screen.
;
; As an interrupt handler it may be entered with arbitrary register
; contents, so it saves and restores everything it touches, loads its
; own DS and clears the direction flag before any string instruction.
;-----------------------------------------------------------------------
I_main: ; called at 18.2 Hz by timer interrupt

    pusha
    push ds
    push es

    push cs
    pop ds                ; .COM program: data lives in the code segment
    cld                   ; LODSB/STOSB below must count upwards

    call F_horizontal_blur
    call F_sense_particle
    call F_move_particle
    call F_deposit
    ;call F_decay


;;;;;;;; copy trail map to screen ;;;;;;;;

    ; Set ES to video memory segment
    mov ax, 0xA000
    mov es, ax

    ; Calculate offset in video memory to center the bitmap
    ; (320-256)/2 = 32 pixels horizontally
    ; (200-128)/2 = 36 pixels vertically
    ; 1 pixel = 1 byte in this mode, so:
    ; Horizontal offset = 32
    ; Vertical offset = 36 * 320 (because of the width of the screen)
    mov di, (36*320)+32   ; Destination offset in video memory

    ; Set DS:SI to the source bitmap data
    mov si, trailmap

    ; Copy loop
    mov cx, 128           ; Number of rows

    L_copy_row:
        push cx
        mov cx, 256           ; Number of bytes per row to copy

        L_copy_pixel:
            ; Copy byte from DS:SI to ES:DI
            lodsb                  ; Load byte from DS:SI into AL, increment SI

            ; map the 8-bit intensity to one of 16 palette entries
            shr al, 4
            add al, 16 ; 32 for rainbow

            stosb                  ; Store byte from AL to ES:DI, increment DI
            loop L_copy_pixel

        ; Add offset to DI to jump to the next line
        add di, 320-256        ; 320 (screen width) - 256 (bitmap width)
        pop cx
        loop L_copy_row

    pop es
    pop ds
    popa

iret ; main is an interrupt handler, so we must return with IRET


